library(tidyverse)
library(mice)
library(skimr)
library(corrplot)
library(car)
library(ISLR)
library(ggplot2)
library(gridExtra)
library(SamplingStrata)
library(rbin)
library(leaps)

Objective 1:

Question of Interest: which variables can be used to predict the price of a NYC Airbnb?

# Load the 2019 NYC Airbnb listings straight from the project repository;
# strip.white trims padding around unquoted character fields.
nycraw <- read.csv(
  "https://raw.githubusercontent.com/JaclynCoate/6372_Project_1/master/AB_NYC_2019.csv",
  header = TRUE,
  strip.white = TRUE
)
# Peek at the first rows to sanity-check the import
head(nycraw)
##     id                                             name host_id
## 1 2539               Clean & quiet apt home by the park    2787
## 2 2595                            Skylit Midtown Castle    2845
## 3 3831                  Cozy Entire Floor of Brownstone    4869
## 4 5022 Entire Apt: Spacious Studio/Loft by central park    7192
## 5 5099        Large Cozy 1 BR Apartment In Midtown East    7322
## 6 5121                                  BlissArtsSpace!    7356
##     host_name neighbourhood_group      neighbourhood latitude longitude
## 1        John            Brooklyn         Kensington 40.64749 -73.97237
## 2    Jennifer           Manhattan            Midtown 40.75362 -73.98377
## 3 LisaRoxanne            Brooklyn       Clinton Hill 40.68514 -73.95976
## 4       Laura           Manhattan        East Harlem 40.79851 -73.94399
## 5       Chris           Manhattan        Murray Hill 40.74767 -73.97500
## 6       Garon            Brooklyn Bedford-Stuyvesant 40.68688 -73.95596
##         room_type price minimum_nights number_of_reviews last_review
## 1    Private room   149              1                 9    10/19/18
## 2 Entire home/apt   225              1                45     5/21/19
## 3 Entire home/apt    89              1               270      7/5/19
## 4 Entire home/apt    80             10                 9    11/19/18
## 5 Entire home/apt   200              3                74     6/22/19
## 6    Private room    60             45                49     10/5/17
##   reviews_per_month calculated_host_listings_count availability_365
## 1              0.21                              6              365
## 2              0.38                              2              355
## 3              4.64                              1              194
## 4              0.10                              1                0
## 5              0.59                              1              129
## 6              0.40                              1                0
str(nycraw)
## 'data.frame':    34464 obs. of  16 variables:
##  $ id                            : int  2539 2595 3831 5022 5099 5121 5178 5203 5238 5295 ...
##  $ name                          : Factor w/ 34000 levels ""," Private 1 bdrm Lefferts Gr, BK apt",..: 8990 27105 11178 13776 17775 5839 17808 11099 12643 3931 ...
##  $ host_id                       : int  2787 2845 4869 7192 7322 7356 8967 7490 7549 7702 ...
##  $ host_name                     : Factor w/ 9124 levels "","​ Valéria",..: 4017 3840 4984 4754 1544 2840 7735 5519 958 4836 ...
##  $ neighbourhood_group           : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 2 3 3 2 3 3 3 3 ...
##  $ neighbourhood                 : Factor w/ 218 levels "Allerton","Arden Heights",..: 108 127 42 62 137 14 95 201 36 201 ...
##  $ latitude                      : num  40.6 40.8 40.7 40.8 40.7 ...
##  $ longitude                     : num  -74 -74 -74 -73.9 -74 ...
##  $ room_type                     : Factor w/ 3 levels "Entire home/apt",..: 2 1 1 1 1 2 2 2 1 1 ...
##  $ price                         : int  149 225 89 80 200 60 79 79 150 135 ...
##  $ minimum_nights                : int  1 1 1 10 3 45 2 2 1 5 ...
##  $ number_of_reviews             : int  9 45 270 9 74 49 430 118 160 53 ...
##  $ last_review                   : Factor w/ 908 levels "1/1/17","1/1/18",..: 113 575 775 175 671 144 677 745 716 671 ...
##  $ reviews_per_month             : num  0.21 0.38 4.64 0.1 0.59 0.4 3.47 0.99 1.33 0.43 ...
##  $ calculated_host_listings_count: int  6 2 1 1 1 1 1 1 4 1 ...
##  $ availability_365              : int  365 355 194 0 129 0 220 0 188 6 ...

EDA to determine type of multiple linear regression to perform

Removing logically irrelevant variables

# Dropping logically irrelevant variables: "id", "name", "host_id", "host_name",
# "last_review", "latitude", "longitude", "neighbourhood", "availability_365"
nyc2 <- nycraw %>%
  select(-c("id", "name", "host_id", "host_name", "last_review",
            "latitude", "longitude", "neighbourhood", "availability_365"))
# Show the columns that remain
head(nyc2)
##   neighbourhood_group       room_type price minimum_nights
## 1            Brooklyn    Private room   149              1
## 2           Manhattan Entire home/apt   225              1
## 3            Brooklyn Entire home/apt    89              1
## 4           Manhattan Entire home/apt    80             10
## 5           Manhattan Entire home/apt   200              3
## 6            Brooklyn    Private room    60             45
##   number_of_reviews reviews_per_month calculated_host_listings_count
## 1                 9              0.21                              6
## 2                45              0.38                              2
## 3               270              4.64                              1
## 4                 9              0.10                              1
## 5                74              0.59                              1
## 6                49              0.40                              1

Dependent Variable Check

  • Checking the range of the dependent variable for zeros that need to be removed; a stay in NYC would not be free.
# Keep only listings whose price is non-zero
nyc2 <- nyc2[nyc2$price != 0, ]
# Open the filtered table in the data viewer without echoing it
invisible(view(nyc2))

NA Evaluation and Drop

#Checking for NAs
md.pattern(nyc2)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##       neighbourhood_group room_type price minimum_nights number_of_reviews
## 34454                   1         1     1              1                 1
##                         0         0     0              0                 0
##       reviews_per_month calculated_host_listings_count  
## 34454                 1                              1 0
##                       0                              0 0
nrow(nyc2)
## [1] 34454
#Drop NAs that are present
nyc3 <- na.omit(nyc2)
#Confirming NA drop
nrow(nyc3)
## [1] 34454

Zero variance variable check - all show variance so remain in model

#Results show no zero variance variables, leave in all
skim(nyc3)
## Skim summary statistics
##  n obs: 34454 
##  n variables: 7 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────
##             variable missing complete     n n_unique
##  neighbourhood_group       0    34454 34454        5
##            room_type       0    34454 34454        3
##                                   top_counts ordered
##  Bro: 14552, Man: 14436, Que: 4304, Bro: 854   FALSE
##      Ent: 17919, Pri: 15752, Sha: 783, NA: 0   FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────
##                        variable missing complete     n   mean     sd p0
##  calculated_host_listings_count       0    34454 34454   5.56  27.76  1
##                  minimum_nights       0    34454 34454   5.78  15.48  1
##               number_of_reviews       0    34454 34454  32.34  50.23  1
##                           price       0    34454 34454 140.73 167.84 10
##  p25 p50 p75  p100     hist
##    1   1   2   327 ▇▁▁▁▁▁▁▁
##    2   2   4   999 ▇▁▁▁▁▁▁▁
##    4  12  39   629 ▇▁▁▁▁▁▁▁
##   68 100 170 10000 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##           variable missing complete     n mean   sd   p0 p25  p50  p75
##  reviews_per_month       0    34454 34454 1.53 1.72 0.02 0.3 0.94 2.27
##  p100     hist
##  58.5 ▇▁▁▁▁▁▁▁

Storing all categorical variables as factors

#Storing categorical variables as factors
# read.csv() only returns factors when stringsAsFactors is TRUE, which is no
# longer the default from R 4.0 on. Convert explicitly so the factor-dependent
# plots further down (pairs(..., col = <factor>), plot(<factor>, y)) behave the
# same on every R version. as.factor() leaves an existing factor unchanged.
nyc3$neighbourhood_group <- as.factor(nyc3$neighbourhood_group)
nyc3$room_type <- as.factor(nyc3$room_type)
# Confirm the column types after conversion
skim(nyc3)
## Skim summary statistics
##  n obs: 34454 
##  n variables: 7 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────
##             variable missing complete     n n_unique
##  neighbourhood_group       0    34454 34454        5
##            room_type       0    34454 34454        3
##                                   top_counts ordered
##  Bro: 14552, Man: 14436, Que: 4304, Bro: 854   FALSE
##      Ent: 17919, Pri: 15752, Sha: 783, NA: 0   FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────
##                        variable missing complete     n   mean     sd p0
##  calculated_host_listings_count       0    34454 34454   5.56  27.76  1
##                  minimum_nights       0    34454 34454   5.78  15.48  1
##               number_of_reviews       0    34454 34454  32.34  50.23  1
##                           price       0    34454 34454 140.73 167.84 10
##  p25 p50 p75  p100     hist
##    1   1   2   327 ▇▁▁▁▁▁▁▁
##    2   2   4   999 ▇▁▁▁▁▁▁▁
##    4  12  39   629 ▇▁▁▁▁▁▁▁
##   68 100 170 10000 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##           variable missing complete     n mean   sd   p0 p25  p50  p75
##  reviews_per_month       0    34454 34454 1.53 1.72 0.02 0.3 0.94 2.27
##  p100     hist
##  58.5 ▇▁▁▁▁▁▁▁

Checking for Multicollinearity

  • Multicollinearity will weaken the model
    • number_of_reviews and reviews_per_month are correlated at 55%
      • Removing reviews_per_month
# Work on a copy of the cleaned data for the correlation review
corrNYC <- nyc3
#Table numeric variables
# Correlation matrix of the numeric columns, piped into view() for inspection;
# corrNYCTable holds whatever view() returns (presumably the matrix itself — confirm)
corrNYCTable <- corrNYC %>% keep(is.numeric) %>% cor %>% view
#Plot numeric variables v numeric variables
# Same correlation matrix drawn as an upper-triangle corrplot with coefficients overlaid
corrNYC %>% keep(is.numeric) %>% cor %>% corrplot("upper", addCoef.col = "white", number.digits = 2, number.cex = 0.5, method="square", order="hclust", tl.srt=45, tl.cex = 0.8)

# Re-open the stored table in the viewer without echoing it
invisible(view(corrNYCTable))
#Removing reviews_per_month due to its high correlation with number_of_reviews
nyc4 <- select(nyc3, -c("reviews_per_month"))

Summary Review of Data Set

summary(nyc4)
##     neighbourhood_group           room_type         price        
##  Bronx        :  854    Entire home/apt:17919   Min.   :   10.0  
##  Brooklyn     :14552    Private room   :15752   1st Qu.:   68.0  
##  Manhattan    :14436    Shared room    :  783   Median :  100.0  
##  Queens       : 4304                            Mean   :  140.7  
##  Staten Island:  308                            3rd Qu.:  170.0  
##                                                 Max.   :10000.0  
##  minimum_nights    number_of_reviews calculated_host_listings_count
##  Min.   :  1.000   Min.   :  1.00    Min.   :  1.000               
##  1st Qu.:  2.000   1st Qu.:  4.00    1st Qu.:  1.000               
##  Median :  2.000   Median : 12.00    Median :  1.000               
##  Mean   :  5.777   Mean   : 32.34    Mean   :  5.558               
##  3rd Qu.:  4.000   3rd Qu.: 39.00    3rd Qu.:  2.000               
##  Max.   :999.000   Max.   :629.00    Max.   :327.000

Removing outliers from minimum nights stay

  • Anything over 365 is more than a year and would be improbable
  • Removing any minimum nights metric over 365
# Keep listings whose minimum stay is at most one year (365 nights)
nyc4 <- nyc4[nyc4$minimum_nights <= 365, ]
# Open the filtered table in the data viewer without echoing it
invisible(view(nyc4))

Examining VIFs

  • The below results show us there is no need to remove any variables
# Additive model: price regressed on every other column of nyc4
full.model<-lm(price~.,data=nyc4)  # . means all variables other than price
# vif() is indexed as a matrix here; the third column is squared and printed
# (presumably GVIF^(1/(2*Df)) as documented by car::vif — confirm)
vif(full.model)[,3]^2
##            neighbourhood_group                      room_type 
##                       1.011167                       1.018374 
##                 minimum_nights              number_of_reviews 
##                       1.029835                       1.014416 
## calculated_host_listings_count 
##                       1.031679
alias(lm(price~.,data=nyc4))
## Model :
## price ~ neighbourhood_group + room_type + minimum_nights + number_of_reviews + 
##     calculated_host_listings_count

Reviewing Linearity with Numeric Variables

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
#nyc4 %>% pairs() No color model
pairs(nyc4,col=nyc4$neighbourhood_group) #Color by neighborhood

par(mfrow=c(2,2))
plot(full.model)

Creating new Log price variable

  • Based on the above plots we may benefit from a transformation
    • Log transforming price to create a log-linear regression
# Build the log-linear data set: append lprice = log(price), then drop raw price
log.nyc <- nyc4 %>%
  mutate(lprice = log(price)) %>%
  select(-c("price"))
invisible(log.nyc)

Examining VIFs of Log Price Variable

# Log-linear model: lprice regressed on every other column of log.nyc
log.depend.model<-lm(lprice~.,data=log.nyc)  # . means all variables other than lprice
# vif() is indexed as a matrix here; the third column is squared and printed
# (presumably GVIF^(1/(2*Df)) as documented by car::vif — confirm)
vif(log.depend.model)[,3]^2
##            neighbourhood_group                      room_type 
##                       1.011167                       1.018374 
##                 minimum_nights              number_of_reviews 
##                       1.029835                       1.014416 
## calculated_host_listings_count 
##                       1.031679
alias(lm(lprice~.,data=log.nyc))
## Model :
## lprice ~ neighbourhood_group + room_type + minimum_nights + number_of_reviews + 
##     calculated_host_listings_count

Reviewing Linearity with Independent and Logged Dependent (Price) Variable

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
pairs(log.nyc,col=log.nyc$neighbourhood_group)

par(mfrow=c(2,2))
plot(log.depend.model)

Log-log model

  • Due to lack of linearity trying to transform the independent variables to see if we can surface a linear relationship
# Append log-transformed copies of the three numeric predictors in one step
log.indep.nyc <- log.nyc %>%
  mutate(
    lreviews = log(number_of_reviews),
    lnights = log(minimum_nights),
    llistings = log(calculated_host_listings_count)
  )
invisible(log.indep.nyc)

# Drop the untransformed predictors so only the logged versions remain
log.indep.nyc <- log.indep.nyc %>%
  select(-c("minimum_nights", "number_of_reviews", "calculated_host_listings_count"))
invisible(log.indep.nyc)

Examining VIFs of Log-Log Model

# Log-log model: lprice regressed on every other column of log.indep.nyc
log.indep.model<-lm(lprice~.,data=log.indep.nyc)  # . means all variables other than lprice
# vif() is indexed as a matrix here; the third column is squared and printed
# (presumably GVIF^(1/(2*Df)) as documented by car::vif — confirm)
vif(log.indep.model)[,3]^2
## neighbourhood_group           room_type            lreviews 
##            1.012151            1.047283            1.057203 
##             lnights           llistings 
##            1.169617            1.084210
alias(lm(lprice~.,data=log.indep.nyc))
## Model :
## lprice ~ neighbourhood_group + room_type + lreviews + lnights + 
##     llistings

Reviewing Linearity with Logged Independent and Dependent Variables

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
pairs(log.indep.nyc,col=log.indep.nyc$neighbourhood_group) #Color by neighborhood

par(mfrow=c(2,2))
plot(log.indep.model)

Continuous Variable Manipulation

  • Since we are seeing large clouds of data but no linear trend with logged and unlogged data, we are going to move forward with binning the data to see if it will assist us in determining if there is a relationship between the continuous variables and log price
# Winsorized binning of number_of_reviews into 50 bins with price passed as the response.
# NOTE(review): the summary printed below reports 0 goods / 0 bads and NaN entropy
# and information value, presumably because rbin expects a binary response rather
# than a continuous price — confirm. nyc5 is not referenced again in the visible
# part of this file.
nyc5 <- rbin_winsorize(nyc4, price, number_of_reviews, 50, winsor_rate = 0.05)
nyc5
## Binning Summary
## ------------------------------
## Method               Winsorize 
## Response             price 
## Predictor            number_of_reviews 
## Bins                 50 
## Count                34450 
## Goods                0 
## Bads                 0 
## Entropy              NaN 
## Information Value    NaN 
## 
## 
## # A tibble: 50 x 7
##    cut_point bin_count  good   bad   woe    iv entropy
##    <chr>         <int> <int> <int> <dbl> <dbl>   <dbl>
##  1 < 3.68         8406     0     0   NaN   NaN     NaN
##  2 < 6.36         4303     0     0   NaN   NaN     NaN
##  3 < 9.04         2959     0     0   NaN   NaN     NaN
##  4 < 11.72        1454     0     0   NaN   NaN     NaN
##  5 < 14.4         1740     0     0   NaN   NaN     NaN
##  6 < 17.08        1395     0     0   NaN   NaN     NaN
##  7 < 19.76         774     0     0   NaN   NaN     NaN
##  8 < 22.44        1034     0     0   NaN   NaN     NaN
##  9 < 25.12         922     0     0   NaN   NaN     NaN
## 10 < 27.8          535     0     0   NaN   NaN     NaN
## # … with 40 more rows
nyc4 %>% keep(is.numeric) %>% pairs() #[Add color]

par(mfrow=c(2,2))
plot(full.model)

Continuous Variable Bin Manipulation

  • Since we are seeing large clouds of data but no linear trend with logged and unlogged data, we are going to move forward with binning the data to see if it will assist us in determining if there is a relationship between the continuous variables and log price
nyc.bins <- nyc4

nyc.bins$reviewsBin <- var.bin(nyc.bins$number_of_reviews, bins = 50)
nyc.bins$nightsBin <- var.bin(nyc.bins$minimum_nights, bins = 50)
nyc.bins$listBin <- var.bin(nyc.bins$calculated_host_listings_count, bins = 10)

nyc.bins <- select(nyc.bins,-c("minimum_nights", "number_of_reviews", "calculated_host_listings_count"))
invisible(nyc.bins)

Reviewing Linearity with Binned Independent Variables

  • No linearity is presenting itself with a binned approach of the independent variables
nyc.bin.model <-lm(price~.,data=nyc.bins)
#nyc.bins  %>% pairs() No color model
pairs(nyc.bins,col=nyc.bins$neighbourhood_group) #Color by neighborhood

par(mfrow=c(2,2))
plot(nyc.bin.model)

Explore potential correlation Neighborhood v Price

  • Up to this point we have not been able to surface linear relationships between our numeric independent variables and our dependent variable
  • Next we will check for correlation with the categorical variables: room_type & neighbourhood_group
  • Without removing the outliers of price above 400 it is nearly impossible to see if there is a difference per neighbourhood. We have removed those prices above 400 to see if the neighbourhood groups differ in price
  • We see a strong chance of correlation between Price and Neighbourhood Group
nyc.categorical <- nyc4[!(nyc4$price > 400),]
nrow(nyc4)
## [1] 34450
nrow(nyc.categorical)
## [1] 33468
# Box plot of price per neighbourhood group (prices above 400 already removed).
# The heading is passed as `main`; `title` is not a graphical parameter, so the
# original call never drew the intended heading.
plot(nyc.categorical$neighbourhood_group, nyc.categorical$price, xlab = "Neighbourhood Group", ylab = "Price", main = "Price v Neighbourhood Group Correlation Check", col=c(7,32,52,82,107))

Explore potential correlation Room Type v Price

  • We see a strong chance of correlation between Price and Room Type
# Box plot of price per room type (prices above 400 already removed).
# The heading is passed as `main`; `title` is not a graphical parameter, so the
# original call never drew the intended heading.
plot(nyc.categorical$room_type, nyc.categorical$price, xlab = "Room Type", ylab = "Price", main = "Price v Room Type Correlation Check", col=c(7,32,52))

Reviewing Linearity with Numeric Variables w/ Price > 400 Outliers Removed

nyc.cat.model<-lm(price~.,data=nyc.categorical) 
#nyc4 %>% pairs() No color model
pairs(nyc.categorical,col=nyc.categorical$neighbourhood_group) #Color by neighborhood

par(mfrow=c(2,2))
plot(nyc.cat.model)

Reviewing Linearity with Numeric Variables w/ Log-Linear model Price > 400 Outliers Removed

  • Still no obvious linearity
# Remove listings priced above 400. lprice is log(price), so the cut-off has to
# be applied on the log scale; the original comparison `lprice > 400` is never
# TRUE for these prices and therefore removed no rows.
# NOTE(review): the model output printed further down was generated before this
# fix (its row count equals nrow(nyc4)) and needs to be regenerated.
log.nyc.outliers <- log.nyc[!(log.nyc$lprice > log(400)),]

log.nyc.outliers.model<-lm(lprice~.,data=log.nyc.outliers) 
pairs(log.nyc.outliers, col=log.nyc.outliers$neighbourhood_group) #Color by neighborhood

par(mfrow=c(2,2))
plot(log.nyc.outliers.model)

Modeling

  • We are not seeing any linear correlation between the dependent and independent numeric variables
  • We are seeing a strong chance of linear correlation between the dependent and independent categorical variables
  • The residual assumptions are best met by the log-linear model
    • Due to this we are moving forward with modeling a log-linear model with singular variables as well as all interaction terms
    • This is to add complexity to our model, since we have a low number of variables to select from
      • In adding this complexity we are trying to surface any possible linear variable interactions that may contribute to our model
      • If these are surfaced we will go back and use graphical means to verify the model’s discovery
# Log-linear fit with both categorical main effects and their interaction, the
# three numeric main effects, and every pairwise numeric interaction
nyc.model <- lm(
  lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count,
  data = log.nyc.outliers
)
summary(nyc.model)
## 
## Call:
## lm(formula = lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type + 
##     minimum_nights + number_of_reviews + calculated_host_listings_count + 
##     minimum_nights:number_of_reviews + minimum_nights:calculated_host_listings_count + 
##     number_of_reviews:calculated_host_listings_count, data = log.nyc.outliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9833 -0.2917 -0.0296  0.2440  4.7328 
## 
## Coefficients:
##                                                          Estimate
## (Intercept)                                             4.692e+00
## neighbourhood_groupBrooklyn                             3.260e-01
## neighbourhood_groupManhattan                            6.047e-01
## neighbourhood_groupQueens                               1.436e-01
## neighbourhood_groupStaten Island                        4.720e-03
## room_typePrivate room                                  -7.062e-01
## room_typeShared room                                   -1.054e+00
## minimum_nights                                         -3.461e-03
## number_of_reviews                                      -2.185e-04
## calculated_host_listings_count                          1.530e-03
## neighbourhood_groupBrooklyn:room_typePrivate room      -1.194e-01
## neighbourhood_groupManhattan:room_typePrivate room     -5.599e-02
## neighbourhood_groupQueens:room_typePrivate room        -2.007e-02
## neighbourhood_groupStaten Island:room_typePrivate room -1.116e-02
## neighbourhood_groupBrooklyn:room_typeShared room       -3.072e-01
## neighbourhood_groupManhattan:room_typeShared room      -6.728e-02
## neighbourhood_groupQueens:room_typeShared room         -6.447e-02
## neighbourhood_groupStaten Island:room_typeShared room   1.826e-02
## minimum_nights:number_of_reviews                        1.906e-05
## minimum_nights:calculated_host_listings_count          -5.631e-05
## number_of_reviews:calculated_host_listings_count       -3.360e-05
##                                                        Std. Error t value
## (Intercept)                                             2.703e-02 173.588
## neighbourhood_groupBrooklyn                             2.748e-02  11.862
## neighbourhood_groupManhattan                            2.743e-02  22.050
## neighbourhood_groupQueens                               2.934e-02   4.896
## neighbourhood_groupStaten Island                        4.732e-02   0.100
## room_typePrivate room                                   3.401e-02 -20.763
## room_typeShared room                                    7.717e-02 -13.659
## minimum_nights                                          2.244e-04 -15.420
## number_of_reviews                                       6.511e-05  -3.355
## calculated_host_listings_count                          1.723e-04   8.879
## neighbourhood_groupBrooklyn:room_typePrivate room       3.490e-02  -3.422
## neighbourhood_groupManhattan:room_typePrivate room      3.498e-02  -1.601
## neighbourhood_groupQueens:room_typePrivate room         3.714e-02  -0.541
## neighbourhood_groupStaten Island:room_typePrivate room  6.372e-02  -0.175
## neighbourhood_groupBrooklyn:room_typeShared room        8.243e-02  -3.727
## neighbourhood_groupManhattan:room_typeShared room       8.168e-02  -0.824
## neighbourhood_groupQueens:room_typeShared room          8.714e-02  -0.740
## neighbourhood_groupStaten Island:room_typeShared room   2.267e-01   0.081
## minimum_nights:number_of_reviews                        4.652e-06   4.098
## minimum_nights:calculated_host_listings_count           8.237e-06  -6.836
## number_of_reviews:calculated_host_listings_count        1.778e-05  -1.889
##                                                        Pr(>|t|)    
## (Intercept)                                             < 2e-16 ***
## neighbourhood_groupBrooklyn                             < 2e-16 ***
## neighbourhood_groupManhattan                            < 2e-16 ***
## neighbourhood_groupQueens                              9.85e-07 ***
## neighbourhood_groupStaten Island                       0.920556    
## room_typePrivate room                                   < 2e-16 ***
## room_typeShared room                                    < 2e-16 ***
## minimum_nights                                          < 2e-16 ***
## number_of_reviews                                      0.000793 ***
## calculated_host_listings_count                          < 2e-16 ***
## neighbourhood_groupBrooklyn:room_typePrivate room      0.000622 ***
## neighbourhood_groupManhattan:room_typePrivate room     0.109489    
## neighbourhood_groupQueens:room_typePrivate room        0.588816    
## neighbourhood_groupStaten Island:room_typePrivate room 0.860993    
## neighbourhood_groupBrooklyn:room_typeShared room       0.000194 ***
## neighbourhood_groupManhattan:room_typeShared room      0.410107    
## neighbourhood_groupQueens:room_typeShared room         0.459457    
## neighbourhood_groupStaten Island:room_typeShared room  0.935790    
## minimum_nights:number_of_reviews                       4.17e-05 ***
## minimum_nights:calculated_host_listings_count          8.26e-12 ***
## number_of_reviews:calculated_host_listings_count       0.058854 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4686 on 34429 degrees of freedom
## Multiple R-squared:  0.5016, Adjusted R-squared:  0.5013 
## F-statistic:  1732 on 20 and 34429 DF,  p-value: < 2.2e-16

Model selection attempts

nyc.fwd = regsubsets(lprice~neighbourhood_group + room_type + neighbourhood_group:room_type + minimum_nights + number_of_reviews + calculated_host_listings_count + minimum_nights:number_of_reviews + minimum_nights:calculated_host_listings_count + number_of_reviews:calculated_host_listings_count, data=log.nyc.outliers, method="forward",nvmax=20)
summary(nyc.fwd)$adjr2
##  [1] 0.3439458 0.4200794 0.4868078 0.4924656 0.4967807 0.4979197 0.4988987
##  [8] 0.4995849 0.5000920 0.5007380 0.5010242 0.5012598 0.5013315 0.5013690
## [15] 0.5013571 0.5013494 0.5013390 0.5013248 0.5013105 0.5012962
summary(nyc.fwd)$rss
##  [1] 9951.173 8796.107 7783.759 7697.722 7632.054 7614.557 7599.489
##  [8] 7588.863 7580.952 7570.937 7566.376 7562.584 7561.278 7560.489
## [15] 7560.451 7560.347 7560.286 7560.281 7560.278 7560.276
summary(nyc.fwd)$bic
##  [1] -14501.19 -18741.22 -22942.97 -23315.43 -23600.14 -23668.76 -23726.55
##  [8] -23764.31 -23789.79 -23824.89 -23835.20 -23842.02 -23837.52 -23830.67
## [15] -23820.40 -23810.42 -23800.25 -23789.83 -23779.40 -23768.96
nyc.fwd2 = regsubsets(lprice~neighbourhood_group + room_type + neighbourhood_group:room_type + minimum_nights + number_of_reviews + calculated_host_listings_count, data=log.nyc.outliers, method="forward",nvmax=50)
summary(nyc.fwd2)$adjr2
##  [1] 0.3439458 0.4200794 0.4868078 0.4924656 0.4967807 0.4979197 0.4988987
##  [8] 0.4994334 0.5000829 0.5003492 0.5004041 0.5003922 0.5003800 0.5003750
## [15] 0.5003609 0.5003467 0.5003322
summary(nyc.fwd2)$rss
##  [1] 9951.173 8796.107 7783.759 7697.722 7632.054 7614.557 7599.489
##  [8] 7591.160 7581.091 7576.831 7575.779 7575.739 7575.705 7575.560
## [15] 7575.555 7575.551 7575.549
summary(nyc.fwd2)$bic
##  [1] -14501.19 -18741.22 -22942.97 -23315.43 -23600.14 -23668.76 -23726.55
##  [8] -23753.88 -23789.16 -23798.07 -23792.41 -23782.14 -23771.85 -23762.06
## [15] -23751.64 -23741.21 -23730.77
nyc.bck = regsubsets(lprice~neighbourhood_group + room_type + neighbourhood_group:room_type + minimum_nights + number_of_reviews + calculated_host_listings_count + minimum_nights:number_of_reviews + minimum_nights:calculated_host_listings_count + number_of_reviews:calculated_host_listings_count, data=log.nyc.outliers, method="backward",nvmax=20)
summary(nyc.bck)$adjr2
##  [1] 0.3439458 0.4200794 0.4868078 0.4924656 0.4967807 0.4979197 0.4988987
##  [8] 0.4995849 0.5000920 0.5007380 0.5010242 0.5012598 0.5013315 0.5013690
## [15] 0.5013571 0.5013494 0.5013390 0.5013248 0.5013105 0.5012962
summary(nyc.bck)$rss
##  [1] 9951.173 8796.107 7783.759 7697.722 7632.054 7614.557 7599.489
##  [8] 7588.863 7580.952 7570.937 7566.376 7562.584 7561.278 7560.489
## [15] 7560.451 7560.347 7560.286 7560.281 7560.278 7560.276
summary(nyc.bck)$bic
##  [1] -14501.19 -18741.22 -22942.97 -23315.43 -23600.14 -23668.76 -23726.55
##  [8] -23764.31 -23789.79 -23824.89 -23835.20 -23842.02 -23837.52 -23830.67
## [15] -23820.40 -23810.42 -23800.25 -23789.83 -23779.40 -23768.96
nyc.exh = regsubsets(lprice~neighbourhood_group + room_type + neighbourhood_group:room_type + minimum_nights + number_of_reviews + calculated_host_listings_count + minimum_nights:number_of_reviews + minimum_nights:calculated_host_listings_count + number_of_reviews:calculated_host_listings_count, data=log.nyc.outliers, method="exhaustive",nvmax=20)
summary(nyc.exh)$adjr2
##  [1] 0.3439458 0.4200794 0.4868078 0.4924656 0.4967807 0.4979197 0.4988987
##  [8] 0.4995849 0.5000920 0.5007380 0.5010242 0.5012598 0.5013315 0.5013690
## [15] 0.5013571 0.5013494 0.5013390 0.5013248 0.5013105 0.5012962
summary(nyc.exh)$rss
##  [1] 9951.173 8796.107 7783.759 7697.722 7632.054 7614.557 7599.489
##  [8] 7588.863 7580.952 7570.937 7566.376 7562.584 7561.278 7560.489
## [15] 7560.451 7560.347 7560.286 7560.281 7560.278 7560.276
summary(nyc.exh)$bic
##  [1] -14501.19 -18741.22 -22942.97 -23315.43 -23600.14 -23668.76 -23726.55
##  [8] -23764.31 -23789.79 -23824.89 -23835.20 -23842.02 -23837.52 -23830.67
## [15] -23820.40 -23810.42 -23800.25 -23789.83 -23779.40 -23768.96
# Sequential-replacement subset selection over the same candidate terms as the
# forward/backward/exhaustive searches above.
nyc.seq <- regsubsets(
  lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count,
  data = log.nyc.outliers, method = "seqrep", nvmax = 20
)
# Summarise the seqrep fit itself; the original code re-printed nyc.exh here, so
# the sequential-replacement results were never actually inspected.
# NOTE(review): the values printed below came from nyc.exh and must be
# regenerated from nyc.seq.
summary(nyc.seq)$adjr2
##  [1] 0.3439458 0.4200794 0.4868078 0.4924656 0.4967807 0.4979197 0.4988987
##  [8] 0.4995849 0.5000920 0.5007380 0.5010242 0.5012598 0.5013315 0.5013690
## [15] 0.5013571 0.5013494 0.5013390 0.5013248 0.5013105 0.5012962
summary(nyc.seq)$rss
##  [1] 9951.173 8796.107 7783.759 7697.722 7632.054 7614.557 7599.489
##  [8] 7588.863 7580.952 7570.937 7566.376 7562.584 7561.278 7560.489
## [15] 7560.451 7560.347 7560.286 7560.281 7560.278 7560.276
summary(nyc.seq)$bic
##  [1] -14501.19 -18741.22 -22942.97 -23315.43 -23600.14 -23668.76 -23726.55
##  [8] -23764.31 -23789.79 -23824.89 -23835.20 -23842.02 -23837.52 -23830.67
## [15] -23820.40 -23810.42 -23800.25 -23789.83 -23779.40 -23768.96

Assumptions Check

  • Residuals are normally distributed
    • Log-linear model has the closest to normally distributed residuals from the last plot
    • Near normal distribution of residuals
    • Invoking the Central Limit Theorem due to such a large sample size
  • Constant variance
    • Near normal QQ-plot
par(mfrow=c(2,2))
plot(log.nyc.outliers.model)

  • Independence
    • Assumed
  • Multicollinearity
    • Confirmed with VIFs and pairs plot that there is no multicollinearity occurring
vif(log.depend.model)[,3]^2
##            neighbourhood_group                      room_type 
##                       1.011167                       1.018374 
##                 minimum_nights              number_of_reviews 
##                       1.029835                       1.014416 
## calculated_host_listings_count 
##                       1.031679
pairs(log.nyc.outliers, col=log.nyc.outliers$neighbourhood_group) #Color by neighborhood

  • Removed all prices over 400 to help reduce the data set and uncover any correlation that was present between the dependent variable and the independent categorical variables
#log.nyc.outliers <- log.nyc[!(log.nyc$lprice > 400),]

MLR May Not Be The Best

  • Multiple linear regression is just one option in building a predictive model for a continuous response
  • We are seeing it as a bad option because
    • The true relationship between the response and predictors is NOT “linear”. The relationships are complex.
      • We have gotten close, but we have worked extremely hard in specifying our model and manipulating the raw data to surface a linear relationship
      • This makes the model difficult to interpret for real-world application
    • Since the above is true and our data is very large, we think that other methods such as Random Forest or K-NN would perform better.
      • These options are less time consuming because the model complexity is built into the algorithm
      • We also do not have to specify how a relationship exists ahead of time
  • Since we see a strong relationship between the categorical variables and price, we move forward with a Two-Way ANOVA model to create a model to predict the price of a NYC AirBnB.